3. Cleaning the dataset

This notebook will perform the process that was applied to the August's data in the previous notebook to the rest of months. So, if you're just reading the project you can skip this file and open the next notebook.

3.1 Preparing the notebook



In [1]:

    
%matplotlib inline
%config InlineBackend.figure_format='retina'



In [2]:

    
import os

import numpy as np
import pandas as pd



In [3]:

    
# VARIABLES

# Paths.
faredata_path = '../data/faredata/trip_fare_{0}.csv'
tripdata_path = '../data/tripdata/trip_data_{0}.csv'
cleaned_data_path = '../data/cleaned/cleaned_{0}.csv'


# Tip percentage variables.
tip_perc_necesary_columns = ['fare_amount', 'surcharge', 'mta_tax', 'tip_amount']
tip_perc_column_name = 'tip_perc'


# Coordinates variables.
min_x = -74.361107
min_y = 40.459518

max_x = -71.903083
max_y = 41.175342

step_x = 0.003205  # 270 meters (885 feet) approximately.
step_y = 0.002429  # 270 meters (885 feet) approximately.

# Number of maximum areas per axis in the 'area matrix'.
max_area_x = 766  # int((max_x - min_x) / step_x)
max_area_y = 294  # int((max_y - min_y) / step_y)

# Number of columns of the 'area matrix'.
num_columns = 767  # max_area_x + 1

# The area at the top right corner in the 'area matrix'.
max_area = 226264  # (max_area_y * num_columns) + max_area_x

# Minimum number of pickups or dropoffs per area.
min_area_number = 20

# Columns for creating the pickup and dropoff area attributes.
area_column_names = ['pickup_area', 'dropoff_area']
area_necesary_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']

# Order of the columns to apply it at before saving.
column_order = ['medallion', 'hack_license', 'vendor_id', 'pickup_datetime', 'fare_amount', 'surcharge', 'tip_amount',
                'tip_perc', 'tolls_amount', 'total_amount', 'passenger_count', 'trip_time_in_secs', 'trip_distance',
                'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']



In [4]:

    
# FUNCTIONS

# Function for caculating the tip percentage of a trip.
def calculate_tip_perc(row):
    subtotal = row.fare_amount + row.surcharge + row.mta_tax
    tip = row.tip_amount / subtotal
    tip_perc = tip * 100
    
    return pd.Series({tip_perc_column_name: tip_perc})


# Functions for calculating. the pickup and dropoff areas in the 'area matrix'.
def get_1d_area(coor, min_coor, step_coor):
    return int((coor - min_coor) / step_coor)

def get_area(area_x, area_y, columns):
    return (area_y * columns) + area_x

def calculate_area(row):
    pickup_x = get_1d_area(row.pickup_longitude, min_x, step_x)
    pickup_y = get_1d_area(row.pickup_latitude, min_y, step_y)
    pickup_area = get_area(pickup_x, pickup_y, num_columns)
    
    dropoff_x = get_1d_area(row.dropoff_longitude, min_x, step_x)
    dropoff_y = get_1d_area(row.dropoff_latitude, min_y, step_y)
    dropoff_area = get_area(dropoff_x, dropoff_y, num_columns)
    
    return pd.Series({
        area_column_names[0]: pickup_area,
        area_column_names[1]: dropoff_area
    })


# This auxiliary function applies another one to every row in a DataFrame for creating new columns.
def iterate_and_apply(dataframe, function, necesary_columns):
    perform = True
    step = 100000
    start = 0
    to = step
    
    while perform:
        new_columns = dataframe[start:to][necesary_columns].apply(function, axis=1)
        if len(new_columns) == 0:
            perform = False
        else:
            dataframe.update(new_columns)
        new_columns = None
        start += step
        to += step
    
    return dataframe



In [5]:

    
# Creating the directory for saving the cleaned files if it doesn't exist.
if not os.path.exists('../data/cleaned/'):
    os.makedirs('../data/cleaned/')

3.2. Performing the process



In [6]:

    
# The range of files to clean.
months = range(1, 8) + range(9, 13)
for month in months:
    
    # Cleaning 'fare' data.
    
    data = pd.read_csv(faredata_path.format(month), skipinitialspace=True)
    
    payment_type = (data.payment_type == 'CRD')
    fare_amount = ((data.fare_amount >= 3.0) & (data.fare_amount <= 200.0))
    surcharge = ((data.surcharge == 0.0) | (data.surcharge == 0.5) | (data.surcharge == 1.0))
    mta_tax = (data.mta_tax == 0.5)
    tip_amount = ((data.tip_amount >= 0.0) & (data.tip_amount <= 100.0))
    tolls_amount = ((data.tolls_amount >= 0.0) & (data.tolls_amount <= 30.0))

    data = data[payment_type & fare_amount & surcharge & mta_tax & tip_amount & tolls_amount]
    data.drop(['payment_type'], axis=1, inplace=True)
    
    payment_type = None
    fare_amount = None
    surcharge = None
    mta_tax = None
    tip_amount = None
    tolls_amount = None
    
    # Calculating the tip percentage.
    
    data[tip_perc_column_name] = np.nan
    data = iterate_and_apply(data, calculate_tip_perc, tip_perc_necesary_columns)
    data.drop(['mta_tax'], axis=1, inplace=True)
    
    tip_perc = (data.tip_perc <= 50.0)
    data = data[tip_perc]
    tip_perc = None
    
    # Adding the trip data file.
    
    tripdata = pd.read_csv(tripdata_path.format(month), skipinitialspace=True, usecols=[3, 7, 8, 9, 10, 11, 12, 13])
    
    for column in tripdata.columns:
        data[column] = np.nan
    
    '''
    Careful! The next line use a lot of memory, a least more than 8GB.
    It's showed here beacuse it's simplier that the another way I know to update the 'fare' DataFrame:
        Read the 'trip' file with NumPy for creating a lot of little DataFrames and update them
        into the data variable in a loop.
    '''
    data.update(tripdata)
    tripdata = None
    
    # Cleaning 'trip' data.
    
    rate_code = (data.rate_code == 1.0)
    passenger_count = ((data.passenger_count >= 1.0) & (data.passenger_count <= 6.0))
    trip_time_in_secs = ((data.trip_time_in_secs > 0.0) & (data.trip_time_in_secs <= 3600.0))
    trip_distance = ((data.trip_distance > 0.0) & (data.trip_distance <= 25.0))
    pickup_latitude = ((data.pickup_latitude >= 40.459518) & (data.pickup_latitude <= 41.175342))
    pickup_longitude = ((data.pickup_longitude >= -74.361107) & (data.pickup_longitude <= -71.903083))
    dropoff_latitude = ((data.dropoff_latitude >= 40.459518) & (data.dropoff_latitude <= 41.175342))
    dropoff_longitude = ((data.dropoff_longitude >= -74.361107) & (data.dropoff_longitude <= -71.903083))
    
    data = data[rate_code & passenger_count & trip_time_in_secs & trip_distance & pickup_latitude & pickup_longitude
                & dropoff_latitude & dropoff_longitude]
    data.drop(['rate_code'], axis=1, inplace=True)
    
    rate_code = None
    passenger_count = None
    trip_time_in_secs = None
    trip_distance = None
    pickup_latitude = None
    pickup_longitude = None
    dropoff_latitude = None
    dropoff_longitude = None
    
    # Filtering the coordinates of unusal places.
    
    for column in area_column_names:
        data[column] = np.nan
    
    data = iterate_and_apply(data, calculate_area, area_necesary_columns)
    
    aux_pickup = data.groupby(['pickup_area']).size()
    aux_pickup = aux_pickup[aux_pickup >= min_area_number]
    aux_pickup = data['pickup_area'].isin(aux_pickup.index)
    
    aux_dropoff = data.groupby(['dropoff_area']).size()
    aux_dropoff = aux_dropoff[aux_dropoff >= min_area_number]
    aux_dropoff = data['dropoff_area'].isin(aux_dropoff.index)
    
    data = data[aux_pickup & aux_dropoff]
    data.drop(['pickup_area', 'dropoff_area'], axis=1, inplace=True)

    aux_pickup = None
    aux_dropoff = None
    
    # Saving the file.
    
    data = data.reindex_axis(column_order, axis=1)
    data.to_csv(cleaned_data_path.format(month), index=True)
    data = None